01. Imports and Inputs

In [265]:
import reading_adjusting as ra

import pandas as pd
import numpy as np
import os

import sklearn
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import shap
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode,  plot
init_notebook_mode(connected=False)
In [219]:
root= 'C:/Users/PauloAlves/Documents/Pessoais/MercadoLibre/'

02. Reading

In [220]:
final_tv_id = pd.read_csv(root + "01. Data/dados_tv_id.csv", index_col = 0)
final_tv = pd.read_csv(root + "01. Data/dados_tv.csv", index_col = 0)

03. Model

03.A - Data Cleansing and Feature Engineering

In [221]:
final_tv_id.shape
Out[221]:
(688, 191)
In [222]:
## Deleting variables with >80% of missings
t = final_tv_id.isnull().sum()/final_tv_id.shape[0]
final_t = t[t < 0.8]
final_tv_id_nonull = final_tv_id[final_t.index]
In [223]:
## Selecting only variables related to the product

names = ['__id','__seller_id','__price','__base_price',
 '__initial_quantity','__available_quantity','__sold_quantity','__listing_type_id','__start_time','__condition','__accepts_mercadopago',
 '__shipping__mode','__shipping__tags','__shipping__local_pick_up','__shipping__free_shipping','__shipping__logistic_type','__shipping__store_pick_up','__status','__tags',
 '__warranty','__catalog_listing','BRAND','MODEL','ACCESSORIES_INCLUDED','ASPECT_RATIO','DEPTH',
 'DISPLAY_SIZE','DISPLAY_TYPE','GTIN','HDMI_PORTS_NUMBER','HEIGHT','INTEGRATED_APPS','INTEGRATED_VIRTUAL_ASSISTANTS','IS_SMART',
 'ITEM_CONDITION','MAX_RESOLUTION','MAX_SPEAKERS_POWER','OPERATIVE_SYSTEM','RESOLUTION_TYPE','SELLER_SKU','SOUND_MODES','SPEAKERS_NUMBER',
 'USB_PORTS_NUMBER','VOLTAGE','WEIGHT','WIDTH','WITH_BLUETOOTH','WITH_ETHERNET','WITH_HDMI','WITH_HDR',
 'WITH_INTEGRATED_VOICE_COMMAND','WITH_USB','WITH_WI_FI','pictures','COLOR','IS_3D','IS_CURVED','IS_PORTABLE',
 'LINE','WITH_AUTO_POWER_OFF','WITH_SCREEN_SHARE_FUNCTION','CONTRAST_RATIO','PROCESSOR_CORES_NUMBER','WITH_NETFLIX','WITH_WEB_BROWSER','WITH_YOUTUBE']
final_tv_id = final_tv_id[list(names)]
final_tv_id.shape
Out[223]:
(688, 66)
In [224]:
# converting list variables into multilple columns

# transforming shipping tags into multiple columns
final_tv_id = ra.list_to_dataframe(df = final_tv_id, var_name = '__shipping__tags', prefix = 'shipping')

# transforming tags into multiple columns
final_tv_id = ra.list_to_dataframe(df = final_tv_id, var_name = '__tags', prefix = 'product')
In [225]:
# Creating seller dataframe
seller_table = final_tv[[col for col in final_tv.columns if col.startswith("seller")]].copy()

# transforming seller tag variables into multiple columns
seller_table = ra.list_to_dataframe(df = seller_table, var_name = 'seller__tags', prefix = 'seller')
In [226]:
# Selecting some of the seller columns

seller_columns = ['seller__id','seller__seller_reputation__transactions__total','seller__seller_reputation__transactions__canceled','seller__seller_reputation__transactions__ratings__negative',
 'seller__seller_reputation__transactions__ratings__positive','seller__seller_reputation__transactions__ratings__neutral','seller__seller_reputation__transactions__completed','seller__seller_reputation__power_seller_status',
 'seller__seller_reputation__metrics__claims__rate','seller__seller_reputation__metrics__delayed_handling_time__rate','seller__seller_reputation__metrics__sales__completed','seller__seller_reputation__metrics__cancellations__rate',
 'seller__seller_reputation__level_id','seller_normal','seller_user_info_verified','seller_credits_priority_4',
 'seller_eshop','seller_mshops','seller_developer','seller_credits_profile',
 'seller_messages_as_seller','seller_messages_as_buyer','seller_brand','seller_large_seller',
 'seller_medium_seller','seller_credits_priority_3','seller_medium_seller_advanced','seller_credits_priority_2',
 'seller_credits_priority_1','seller_from_facebook','seller_credits_active_borrower','seller_credits_open_market']

# Dropping duplicates sellers informations
seller_table = seller_table[seller_columns].drop_duplicates(subset = 'seller__id')
In [227]:
# Merging seller information into TV dataset
final_tv_id = final_tv_id.merge(seller_table, left_on = ['__seller_id'], right_on = ['seller__id'] )
In [228]:
final_tv_id.shape
Out[228]:
(688, 122)
In [229]:
# Transforming column "ACCESSORIES_INCLUDED" into multiple columns

final_tv_id.loc[:, 'ACCESSORIES_INCLUDED'] = final_tv_id.loc[:, 'ACCESSORIES_INCLUDED'].fillna("No Information")
final_tv_id = ra.str_to_dataframe(df = final_tv_id, var_name = 'ACCESSORIES_INCLUDED' , prefix = 'accessories')
In [230]:
# Transforming column "'INTEGRATED_APPS' into multiple columns

final_tv_id.loc[:, 'INTEGRATED_APPS'] = final_tv_id.loc[:, 'INTEGRATED_APPS'].fillna("No Information")
final_tv_id = ra.str_to_dataframe(df = final_tv_id, var_name = 'INTEGRATED_APPS' , prefix = 'apps')
final_tv_id.drop(['WITH_NETFLIX','WITH_WEB_BROWSER','WITH_YOUTUBE'], axis = 1, inplace = True)
In [231]:
# changing index to be the "id" column
final_tv_id.index = final_tv_id['__id']
final_tv_id.drop(['__id'], axis = 1, inplace = True)
In [232]:
# inserting the value "No Information" to all object/category columns
str_columns = final_tv_id.select_dtypes(include = ['object','category']).columns
final_tv_id[str_columns] = final_tv_id[str_columns].fillna("No Information")
In [233]:
# dropping some other columns that appears to be not usefull
final_tv_id.drop(['__start_time','MODEL','GTIN','ITEM_CONDITION','SELLER_SKU','LINE','seller__id','__seller_id'], axis = 1, inplace = True)
In [234]:
# Creating variable 'Percent_Information'. Pecentage of attributes columns with useful information (not null)

attributes_columns = [ 'ASPECT_RATIO','DEPTH',
 'DISPLAY_SIZE','DISPLAY_TYPE','HDMI_PORTS_NUMBER','HEIGHT','INTEGRATED_VIRTUAL_ASSISTANTS','IS_SMART','MAX_RESOLUTION','MAX_SPEAKERS_POWER',
 'OPERATIVE_SYSTEM','RESOLUTION_TYPE','SOUND_MODES','SPEAKERS_NUMBER','USB_PORTS_NUMBER','VOLTAGE','WEIGHT','WIDTH',
 'WITH_BLUETOOTH','WITH_ETHERNET','WITH_HDMI','WITH_HDR','WITH_INTEGRATED_VOICE_COMMAND','WITH_USB','WITH_WI_FI','pictures',
 'COLOR','IS_3D','IS_CURVED','IS_PORTABLE','WITH_AUTO_POWER_OFF','WITH_SCREEN_SHARE_FUNCTION','CONTRAST_RATIO','PROCESSOR_CORES_NUMBER']

final_tv_id['Percent_Information'] = (final_tv_id[attributes_columns] != 'No Information').sum(axis = 1)/len(attributes_columns)
In [235]:
# Creating variable 'Percent_Apps'. Pecentage of apps columns with useful information (not null)
app_columns = [c for c in final_tv_id.columns if c.startswith('apps_')]
final_tv_id['Percent_Apps'] = (final_tv_id[app_columns] == 1).sum(axis = 1)/len(app_columns)
In [236]:
# Creating variable 'Percent_Accessories'. Pecentage of accesspries columns with useful information (not null)

accessories_columns =[c for c in final_tv_id.columns if c.startswith('accessories_')]
final_tv_id['Percent_Accessories'] = (final_tv_id[accessories_columns] == 1).sum(axis = 1)/len(accessories_columns)
In [237]:
# Transforming TV dimensions from text to float (centimeters)

### DEPTH from text to float
final_tv_id['DEPTH'] = final_tv_id['DEPTH'].str.replace(" cm","").replace("No Information", np.nan)

mask_mm =  (final_tv_id['DEPTH'].str.find(" mm") > 0)
final_tv_id.loc[ mask_mm,['DEPTH']] = final_tv_id.loc[ mask_mm,'DEPTH'].str.replace(" mm","").astype(float)/10

mask_po =  (final_tv_id['DEPTH'].str.find(' "') > 0)
final_tv_id.loc[ mask_po,['DEPTH']] = final_tv_id.loc[ mask_po,'DEPTH'].str.replace(' "',"").astype(float)*2.54

final_tv_id['DEPTH'] = final_tv_id['DEPTH'].astype(float)
In [238]:
### HEIGHT from text to float (centimeters)
final_tv_id['HEIGHT'] = final_tv_id['HEIGHT'].str.replace(" cm","").replace("No Information", np.nan)

mask_mm =  (final_tv_id['HEIGHT'].str.find(" mm") > 0)
final_tv_id.loc[ mask_mm,['HEIGHT']] = final_tv_id.loc[ mask_mm,'HEIGHT'].str.replace(" mm","").astype(float)/10

mask_po =  (final_tv_id['HEIGHT'].str.find(' "') > 0)
final_tv_id.loc[ mask_po,['HEIGHT']] = final_tv_id.loc[ mask_po,'HEIGHT'].str.replace(' "',"").astype(float)*2.54

final_tv_id['HEIGHT'] = final_tv_id['HEIGHT'].astype(float)
In [239]:
### WEIGHT from text to float (Kg)
final_tv_id['WEIGHT'] = final_tv_id['WEIGHT'].str.replace(" kg","").replace("No Information", np.nan)

mask_lb =  (final_tv_id['WEIGHT'].str.find(" lb") > 0)
final_tv_id.loc[ mask_lb,['WEIGHT']] = final_tv_id.loc[ mask_lb,'WEIGHT'].str.replace(" lb","").astype(float) * 0.453592
final_tv_id['WEIGHT'] = final_tv_id['WEIGHT'].astype(float)
In [240]:
### WIDTH from text to float (centimeters)
final_tv_id['WIDTH'] = final_tv_id['WIDTH'].str.replace(" cm","").replace("No Information", np.nan)

mask_mm =  (final_tv_id['WIDTH'].str.find(" mm") > 0)
final_tv_id.loc[ mask_mm,['WIDTH']] = final_tv_id.loc[ mask_mm,'WIDTH'].str.replace(" mm","").astype(float)/10

mask_po =  (final_tv_id['WIDTH'].str.find(' "') > 0)
final_tv_id.loc[ mask_po,['WIDTH']] = final_tv_id.loc[ mask_po,'WIDTH'].str.replace(' "',"").astype(float)*2.54

final_tv_id['WIDTH'] = final_tv_id['WIDTH'].astype(float)
In [241]:
### Display from text to float
final_tv_id['DISPLAY_SIZE'] = final_tv_id['DISPLAY_SIZE'].str.replace(' "',"").replace("No Information", np.nan)
final_tv_id['DISPLAY_SIZE'] = final_tv_id['DISPLAY_SIZE'].str.replace(' in',"")
final_tv_id['DISPLAY_SIZE'] = final_tv_id['DISPLAY_SIZE'].str.replace(' pulgadas',"")
final_tv_id['DISPLAY_SIZE'] = final_tv_id['DISPLAY_SIZE'].astype(float)
In [242]:
### MAX_SPEAKERS_POWER from text to float (centimeters)
final_tv_id['MAX_SPEAKERS_POWER'] = final_tv_id['MAX_SPEAKERS_POWER'].str.replace(" W","").replace("No Information", np.nan)

final_tv_id['MAX_SPEAKERS_POWER'] = final_tv_id['MAX_SPEAKERS_POWER'].astype(float)
In [243]:
final_tv_id[['SPEAKERS_NUMBER','USB_PORTS_NUMBER','PROCESSOR_CORES_NUMBER','HDMI_PORTS_NUMBER']].dtypes
Out[243]:
SPEAKERS_NUMBER           float64
USB_PORTS_NUMBER          float64
PROCESSOR_CORES_NUMBER    float64
HDMI_PORTS_NUMBER         float64
dtype: object

03.B Model Preparations

In [244]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(final_tv_id.loc[:,~final_tv_id.columns.isin(["__sold_quantity"])], final_tv_id ["__sold_quantity"], random_state = 123)

# Applying one-hot encoder in all categorical columns
ohe = OneHotEncoder(handle_unknown = 'ignore', sparse = False)

str_columns = X_train.select_dtypes(include = ['object','category']).columns
ohe.fit(X_train[str_columns])

# all original columns are deleted from both dataframes
X_train[ohe.get_feature_names()] = pd.DataFrame( ohe.transform(X_train[str_columns]), index = X_train.index, columns = ohe.get_feature_names())
X_train.drop(str_columns, axis = 1, inplace = True)

X_test[ohe.get_feature_names()] = pd.DataFrame( ohe.transform(X_test[str_columns]), index = X_test.index, columns = ohe.get_feature_names())
X_test.drop(str_columns, axis = 1, inplace = True)

03.C Finding best XGBoost configuration with Hyperopt

In [246]:
iterations = 1000
params = {
            "eta"             : hp.uniform("eta"             , 0.1, 0.5),
            "gamma"          : hp.uniform("gamma"            , 0  , 0.3),
            "max_depth"       : hp.randint("max_depth "      , 2, 6),
            "min_child_weight": hp.randint("min_child_weight", 1  , int(X_train.shape[0]*0.1) ),
            "subsample"       : hp.uniform("subsample"       , 0.5, 1),
            "sampling_method" : hp.choice( "sampling_method" ,['uniform','gradient_based']),
            "colsample_bytree": hp.uniform("colsample_bytree", 0.1, 0.9),
            "lambda"          : hp.uniform("lambda"          , 0.5  , 3),
            "alpha"           : hp.uniform("alpha"           , 0.5  , 3),
            "n_estimators"    : hp.randint("n_estimators"    , 30  , 100)
        }

def f_nn(params):
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_model.set_params(**params)
    xgb_model.fit(X_train, y_train, early_stopping_rounds=10,eval_metric="mae", eval_set=[(X_test, y_test)], verbose = False)
    acc = mae(y_test, xgb_model.predict(X_test))
    return {"loss": acc, "status": STATUS_OK}

trials = Trials()
from functools import partial

algo = partial(
    tpe.suggest,
    n_startup_jobs=int(iterations * 0.3),
    gamma=0.25,
    n_EI_candidates=200,
)

best = fmin(f_nn, params, algo=algo, max_evals=iterations, trials=trials)
100%|██████████████████████████████████████████████| 1000/1000 [14:30<00:00,  1.15trial/s, best loss: 7.32598485970913]
In [251]:
best_params = space_eval(params, best)
best_params
Out[251]:
{'alpha': 0.8942075120229254,
 'colsample_bytree': 0.7550682305708544,
 'eta': 0.16357940377932206,
 'gamma': 0.20765039452357484,
 'lambda': 1.0951983513137367,
 'max_depth': 3,
 'min_child_weight': 5,
 'n_estimators': 85,
 'sampling_method': 'uniform',
 'subsample': 0.9341018777329546}

03.D - Recursive Feature Extraction

In [252]:
step = 20
columns = X_train.columns
acc = []
n_poins = {}
n = len(columns)
while n > 0:
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_model.set_params(**best_params)
    xgb_model.fit(X_train[columns], y_train, early_stopping_rounds=10,eval_metric="mae", eval_set=[(X_test[columns], y_test)], verbose = False)
    acc.append(mae(y_test, xgb_model.predict(X_test[columns])))
    n_poins[n] = columns
    
    fi = pd.DataFrame(xgb_model.feature_importances_, index = X_train[columns].columns).sort_values(0,ascending = False)
    columns = fi.iloc[ 0: (fi.shape[0] - step),:].index
    n = len(columns)
    
In [253]:
pd.DataFrame({"Number_columns": list(n_poins.keys()),"MAE":acc})
Out[253]:
Number_columns MAE
0 356 7.325985
1 336 8.578941
2 316 8.295134
3 296 8.393082
4 276 8.170530
5 256 8.016501
6 236 8.717547
7 216 8.439569
8 196 8.227950
9 176 7.664289
10 156 8.199388
11 136 8.216766
12 116 8.478530
13 96 8.288501
14 76 7.833954
15 56 8.341686
16 36 8.187076
17 16 7.201591
18 12 7.643466
19 4 8.759955

03.E Training with best configuration and number of variables

In [254]:
columns = n_poins[16] #sixteen columns
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.set_params(**best_params)
eval_set = [(X_train[columns], y_train), (X_test[columns], y_test)]
xgb_model.fit(X_train[columns], y_train, early_stopping_rounds=10,eval_metric=["mae",'rmse'], eval_set=eval_set, verbose = False)
Out[254]:
XGBRegressor(alpha=0.8942075120229254, colsample_bytree=0.7550682305708544,
             eta=0.16357940377932206, gamma=0.20765039452357484,
             lambda=1.0951983513137367, min_child_weight=5, n_estimators=85,
             objective='reg:squarederror', random_state=42,
             sampling_method='uniform', subsample=0.9341018777329546)
In [255]:
results = xgb_model.evals_result()
epochs = len(results['validation_0']['mae'])
x_axis = range(0, epochs)
In [256]:
# Mean absolute Error
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['mae'], label='Train')
ax.plot(x_axis, results['validation_1']['mae'], label='Test')
ax.legend()
plt.ylabel('MAE')
plt.xlabel('Epoch')
plt.title('XGBoost - Mean Absolute Error (MAE)')
plt.show()
In [257]:
# Root Mean Squared Error
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['rmse'], label='Train')
ax.plot(x_axis, results['validation_1']['rmse'], label='Test')
ax.legend()
plt.ylabel('RMSE')
plt.xlabel("Epoch")
plt.title('XGBoost - Root Mean Squared Error')
plt.show()
In [258]:
# training with less epochs: 35 .. reducing the gap between train and test

best_params['n_estimators'] = 35
columns = n_poins[16]
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.set_params(**best_params)
eval_set = [(X_train[columns], y_train), (X_test[columns], y_test)]
xgb_model.fit(X_train[columns], y_train, early_stopping_rounds=10,eval_metric=["mae",'rmse'], eval_set=eval_set, verbose = False)
Out[258]:
XGBRegressor(alpha=0.8942075120229254, colsample_bytree=0.7550682305708544,
             eta=0.16357940377932206, gamma=0.20765039452357484,
             lambda=1.0951983513137367, min_child_weight=5, n_estimators=35,
             objective='reg:squarederror', random_state=42,
             sampling_method='uniform', subsample=0.9341018777329546)
In [260]:
# Root Mean Squared Error
columns = n_poins[16]

predict_trt = xgb_model.predict(X_train[columns])
predict_tst = xgb_model.predict(X_test[columns])
error_tst = predict_tst - y_test
error_trt = predict_trt - y_train

fig = go.Figure(go.Histogram(x=error_trt
                             ,histnorm='percent'
                             ,name = "Train"
                             ,opacity = 0.5
                            , xbins=dict( # bins used for histogram
                                            start=-500.0,
                                            end=50.0,
                                            size=5
                                        )
                            )
               )


fig.add_trace(go.Histogram(x=error_tst
                           ,histnorm='percent'
                           ,name = "Test"
                           ,opacity = 0.5
                           , xbins=dict( # bins used for histogram
                                            start=-500.0,
                                            end=50.0,
                                            size=5
                                        )
                            )
               )

fig.update_layout(yaxis=dict(title='Porcentaje'),
                  xaxis=dict(title="Error"),
                 title = 'Distribuicion de los erros')
fig.update_layout(barmode = 'overlay')
fig.show()

03.F - Model Explainability with Shap

In [261]:
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_train[columns])
In [262]:
# Feature Importance
shap.plots.bar(shap_values)
In [263]:
# visualize the first prediction's explanation
shap.plots.waterfall(shap_values[0])
In [264]:
# summarize the effects of all the features
shap.plots.beeswarm(shap_values)
In [ ]: